# import packages and modules
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.transforms import Affine2D
import sys
sys.path.insert(1,'/REDACTED/fairness/code/utilities')
import estimators as et
import time
from pandas._libs.lib import is_integer
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
from scipy import stats


## files to read: 

# individual2014 
# ncdata 
# ncweights 

keep_cols = ['taxpayer_id', 'predicted_prob_black', 'isEIC', 'filing_jointly', 'isM', 'pos_deps', 'aud_no_research_audits', 'adj_gross_inc']
individual2014 = pd.read_csv('/REDACTED/data/final/individualBISG2014_full_final.csv', usecols=keep_cols)
ncdata = pd.read_csv("/REDACTED/NC_Analysis_Dataset_2014_tplevel.csv")
ncweights = pd.read_csv("/REDACTED/HertzGraff/nc_reweights_v4_April.csv")

# clean datasets and merge together
ncdata = ncdata[ncdata.black_ind.notna()]
ncdata = ncdata[['taxpayer_id', 'black_ind']]

ncweights = ncweights[['taxpayer_id', 'uswgt']]

individual2014 = pd.merge(individual2014, ncdata, on = 'taxpayer_id', how = 'left')
len(individual2014[individual2014.black_ind.notnull()])
len(individual2014)
individual2014 = pd.merge(individual2014, ncweights, on = 'taxpayer_id', how = 'left')
len(individual2014[individual2014.uswgt.notnull()])
len(individual2014)

# prep dataset for analysis
individual2014['unitwt'] = 1
individual2014['p_black_rd'] = individual2014['predicted_prob_black'].round(2)
individual2014['p_black_rd'] = pd.to_numeric(individual2014['p_black_rd'])
individual2014['predicted_prob_black'] = pd.to_numeric(individual2014['predicted_prob_black'])
individual2014['black_ind'] = pd.to_numeric(individual2014['black_ind'])

######################
### EITC SUBGROUPS ###
######################

# define EITC subgroups to calculate covariance conditions for
varlist = ['Metrics', 'Single EITC',
           'Joint EITC',
           'Single Male EITC',
           'Single Female EITC',
           'Single Male EITC with Deps',
           'Single Male EITC without Deps']
output = pd.DataFrame(columns=varlist)
output['Metrics'] = ['E_cov_cond_b', 'E_cov_cond_b_se', 'E_cov_cond_B', 'E_cov_cond_B_se', 'nc_weights']
filters = [None, (individual2014.isEIC == 1) & ~(individual2014.filing_jointly == 1),
           (individual2014.isEIC == 1) & (individual2014.filing_jointly == 1),
           (individual2014.isEIC == 1) & ~(individual2014.filing_jointly == 1) & (individual2014.isM == 1),
           (individual2014.isEIC == 1) & ~(individual2014.filing_jointly == 1) & ~(individual2014.isM == 1),
           (individual2014.isEIC == 1) & ~(individual2014.filing_jointly == 1) & (individual2014.isM == 1) & (individual2014.pos_deps == 1),
           (individual2014.isEIC == 1) & ~(individual2014.filing_jointly == 1) & (individual2014.isM == 1)  & ~(individual2014.pos_deps == 1)]

# loop over EITC subgroups and calculate covariance conditions
for i in range(len(varlist)):
    start = time.time()
    print(i)
    if varlist[i] == 'Metrics':
        pass
    else:
        column = []
        dat = individual2014[filters[i]]
        dat = dat[(dat.black_ind.notnull()) & (dat.uswgt.notnull())]
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='uswgt', outcomevarb='aud_no_research_audits', covarb='black_ind', conditioningvarb='p_black_rd')
        column.append(est*10000)
        column.append(se*10000)
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='uswgt', outcomevarb='aud_no_research_audits', covarb='predicted_prob_black', conditioningvarb='black_ind')
        column.append(est*10000)
        column.append(se*10000)
        column.append(1)
        output[varlist[i]] = column
    stop = time.time()
    print(stop-start)

# add sizes of subgroups to dat
lens = []
for i in range(len(filters)):
    print(i)
    if varlist[i] == 'Metrics':
        lens.append("N")
    else:
        dat = individual2014[filters[i]]
        dat = dat[(dat.black_ind.notnull()) & (dat.uswgt.notnull())]
        lens.append(len(dat))
    
output.loc[len(output)] = lens

# option to write out to a LaTeX file
#output.to_latex('/REDACTED/residual_covariance_terms_SE_NC_EITC_subgroups_V6.tex', index=False)

# prep dataset for plottaxpayer_idg
output_massaged = output.set_index('Metrics').transpose().reset_index()
output_massaged = output_massaged.rename(columns={'index':'subgroup'})

# plot covaroance conditions with standard errors
fig,ax = plt.subplots(1,1,sharex = True, sharey = True, figsize=(15,10))
x = output_massaged['subgroup']
y_b = output_massaged['E_cov_cond_b']
yerr_b = output_massaged['E_cov_cond_b_se'] * 1.96
y_B = output_massaged['E_cov_cond_B']
yerr_B = output_massaged['E_cov_cond_B_se'] * 1.96

plt.xticks(rotation=30)

plt.errorbar(x, y_b, yerr=yerr_b, fmt="o", capsize = 9, label = "E[cov(Y,B)|b]")
plt.errorbar(x, y_B, yerr=yerr_B, fmt="o", capsize = 9, label = "E[cov(Y,b)|B]")
plt.xlabel('EITC Subgroup', fontsize=18)
plt.ylabel('Estimated Covariance', fontsize=18)
ax.axhline(y = 0, color = 'red', linestyle = 'dotted')
plt.legend(fontsize=18)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

plt.savefig('/REDACTED/cov_conditions_eitc_subgroups.png', facecolor='white', bbox_inches='tight')
plt.close()

###################
### INCOME BINS ###
###################
# subset to taxpayers with non-negative AGI
dataBISG_noneg_agi = individual2014[individual2014['adj_gross_inc']>=0]

# split into 20 bins based on AGI
n_bin = 20
dataBISG_noneg_agi["bin"] = pd.qcut(dataBISG_noneg_agi["adj_gross_inc"], q = n_bin, labels = list(range(1, n_bin+1)))

dataBISG_noneg_agi.bin.value_counts()
dataBISG_noneg_agi["bin"] = dataBISG_noneg_agi.bin.astype(float).astype('Int64')
bin_20_mean = dataBISG_noneg_agi.groupby('bin')['adj_gross_inc'].mean()

# define income bins to calculate covariance conditions for
varlist = ["Metrics",
          "Bin 1",
          "Bin 2", 
          "Bin 3",
          "Bin 4",
          "Bin 5",
          "Bin 6",
          "Bin 7",
          "Bin 8",
          "Bin 9",
          "Bin 10",
          "Bin 11",
          "Bin 12",
          "Bin 13",
          "Bin 14",
          "Bin 15",
          "Bin 16",
          "Bin 17",
          "Bin 18",
          "Bin 19",
          "Bin 20"]

output = pd.DataFrame(columns=varlist)
output['Metrics'] = ['E_cov_cond_b', 'E_cov_cond_b_se', 'E_cov_cond_B', 'E_cov_cond_B_se', 'nc_weights'] 

filters = [None,
          dataBISG_noneg_agi.bin == 1,
          dataBISG_noneg_agi.bin == 2,
          dataBISG_noneg_agi.bin == 3,
          dataBISG_noneg_agi.bin == 4,
          dataBISG_noneg_agi.bin == 5,
          dataBISG_noneg_agi.bin == 6,
          dataBISG_noneg_agi.bin == 7,
          dataBISG_noneg_agi.bin == 8,
          dataBISG_noneg_agi.bin == 9,
          dataBISG_noneg_agi.bin == 10,
          dataBISG_noneg_agi.bin == 11,
          dataBISG_noneg_agi.bin == 12,
          dataBISG_noneg_agi.bin == 13,
          dataBISG_noneg_agi.bin == 14,
          dataBISG_noneg_agi.bin == 15,
          dataBISG_noneg_agi.bin == 16,
          dataBISG_noneg_agi.bin == 17,
          dataBISG_noneg_agi.bin == 18,
          dataBISG_noneg_agi.bin == 19,
          dataBISG_noneg_agi.bin == 20]

# loop over income bins and compute covariance conditions
lens_bins = []
for i in range(len(varlist)):
    print(i)
    if varlist[i] == 'Metrics':
        lens_bins.append("N")
        pass
    else:
        column = []
        dat = dataBISG_noneg_agi[filters[i]]
        dat = dat[(dat.black_ind.notnull()) & (dat.uswgt.notnull())]
        lens_bins.append(len(dat))
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='uswgt', outcomevarb='aud_no_research_audits', covarb='black_ind', conditioningvarb='p_black_rd')
        column.append(est*10000)
        column.append(se*10000)
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='uswgt', outcomevarb='aud_no_research_audits', covarb='predicted_prob_black', conditioningvarb='black_ind')
        column.append(est*10000)
        column.append(se*10000)
        column.append(1)
        output[varlist[i]] = column


output.loc[len(output)] = lens_bins

# option to write out to a LaTeX file
#output.to_latex('/REDACTED/residual_covariance_terms_SE_NC_income_bins_V6.tex', index=False)

# prep dataset for plottaxpayer_idg
output_massaged = output.set_index('Metrics').transpose().reset_index()
output_massaged = output_massaged.rename(columns={'index':'bin'})
output_massaged = output_massaged[output_massaged['bin'] != 'Bin 20']

# plot covariance conditions with standard errors
fig,ax = plt.subplots(1,1,sharex = True, sharey = True, figsize=(15,10))
x = output_massaged['bin']
y_b = output_massaged['E_cov_cond_b']
yerr_b = output_massaged['E_cov_cond_b_se'] * 1.96
y_B = output_massaged['E_cov_cond_B']
yerr_B = output_massaged['E_cov_cond_B_se'] * 1.96

plt.errorbar(x, y_b, yerr=yerr_b, fmt="o", capsize = 9, label = "E[cov(Y,B)|b]")
plt.errorbar(x, y_B, yerr=yerr_B, fmt="o", capsize = 9, label = "E[cov(Y,b)|B]")
plt.xlabel('Income Bin', fontsize=18)
plt.ylabel('Estimated Covariance', fontsize=18)
ax.axhline(y = 0, color = 'red', linestyle = 'dotted')
plt.legend(fontsize=18)
plt.xticks(rotation=30, fontsize=18)
plt.yticks(fontsize=18)

plt.savefig('/REDACTED/cov_conditions_income_bins.png', facecolor='white', bbox_inches='tight')
plt.close()